1. Study Background Summary

DDSAnalytics is an analytics company that specializes in talent management solutions for Fortune 100 companies. DDSAnalytics has taken on a contract with Frito Lay to tackle predicting employee turnover. Analyzing an existing employee dataset of 870 unique observations with 36 categories, several visualization and models were generated to determine the top three factors leading to attrition.

2. Importing input data

library(readr)
employeeData <- read.csv("CaseStudy2-data.csv")



# list rows of data that have missing values
employeeData[!complete.cases(employeeData),]
 [1] ID                       Age                      Attrition               
 [4] BusinessTravel           DailyRate                Department              
 [7] DistanceFromHome         Education                EducationField          
[10] EmployeeCount            EmployeeNumber           EnvironmentSatisfaction 
[13] Gender                   HourlyRate               JobInvolvement          
[16] JobLevel                 JobRole                  JobSatisfaction         
[19] MaritalStatus            MonthlyIncome            MonthlyRate             
[22] NumCompaniesWorked       Over18                   OverTime                
[25] PercentSalaryHike        PerformanceRating        RelationshipSatisfaction
[28] StandardHours            StockOptionLevel         TotalWorkingYears       
[31] TrainingTimesLastYear    WorkLifeBalance          YearsAtCompany          
[34] YearsInCurrentRole       YearsSinceLastPromotion  YearsWithCurrManager    
<0 rows> (or 0-length row.names)
#Check the data type of the variables in the file
glimpse(employeeData)
Rows: 870
Columns: 36
$ ID                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14~
$ Age                      <int> 32, 40, 35, 32, 24, 27, 41, 37, 34, 34, 43, 2~
$ Attrition                <chr> "No", "No", "No", "No", "No", "No", "No", "No~
$ BusinessTravel           <chr> "Travel_Rarely", "Travel_Rarely", "Travel_Fre~
$ DailyRate                <int> 117, 1308, 200, 801, 567, 294, 1283, 309, 133~
$ Department               <chr> "Sales", "Research & Development", "Research ~
$ DistanceFromHome         <int> 13, 14, 18, 1, 2, 10, 5, 10, 10, 10, 6, 1, 7,~
$ Education                <int> 4, 3, 2, 4, 1, 2, 5, 4, 4, 4, 3, 2, 3, 1, 2, ~
$ EducationField           <chr> "Life Sciences", "Medical", "Life Sciences", ~
$ EmployeeCount            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ EmployeeNumber           <int> 859, 1128, 1412, 2016, 1646, 733, 1448, 1105,~
$ EnvironmentSatisfaction  <int> 2, 3, 3, 3, 1, 4, 2, 4, 3, 4, 1, 3, 3, 3, 4, ~
$ Gender                   <chr> "Male", "Male", "Male", "Female", "Female", "~
$ HourlyRate               <int> 73, 44, 60, 48, 32, 32, 90, 88, 87, 92, 81, 4~
$ JobInvolvement           <int> 3, 2, 3, 3, 3, 3, 4, 2, 3, 2, 2, 3, 3, 3, 3, ~
$ JobLevel                 <int> 2, 5, 3, 3, 1, 3, 1, 2, 1, 2, 5, 1, 3, 1, 1, ~
$ JobRole                  <chr> "Sales Executive", "Research Director", "Manu~
$ JobSatisfaction          <int> 4, 3, 4, 4, 4, 1, 3, 4, 3, 3, 3, 4, 3, 2, 1, ~
$ MaritalStatus            <chr> "Divorced", "Single", "Single", "Married", "S~
$ MonthlyIncome            <int> 4403, 19626, 9362, 10422, 3760, 8793, 2127, 6~
$ MonthlyRate              <int> 9250, 17544, 19944, 24032, 17218, 4809, 5561,~
$ NumCompaniesWorked       <int> 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 7, 1, 3, 1, 6, ~
$ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", ~
$ OverTime                 <chr> "No", "No", "No", "No", "Yes", "No", "Yes", "~
$ PercentSalaryHike        <int> 11, 14, 11, 19, 13, 21, 12, 14, 19, 14, 13, 1~
$ PerformanceRating        <int> 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, ~
$ RelationshipSatisfaction <int> 3, 1, 3, 3, 3, 3, 1, 3, 4, 2, 4, 2, 2, 1, 3, ~
$ StandardHours            <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8~
$ StockOptionLevel         <int> 1, 0, 0, 2, 0, 2, 0, 3, 1, 1, 0, 1, 0, 1, 0, ~
$ TotalWorkingYears        <int> 8, 21, 10, 14, 6, 9, 7, 8, 1, 8, 21, 3, 17, 1~
$ TrainingTimesLastYear    <int> 3, 2, 2, 3, 2, 4, 5, 5, 2, 3, 2, 2, 3, 3, 3, ~
$ WorkLifeBalance          <int> 2, 4, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, ~
$ YearsAtCompany           <int> 5, 20, 2, 14, 6, 9, 4, 1, 1, 8, 16, 3, 8, 1, ~
$ YearsInCurrentRole       <int> 2, 7, 2, 10, 3, 7, 2, 0, 1, 2, 12, 2, 5, 0, 6~
$ YearsSinceLastPromotion  <int> 0, 4, 2, 5, 1, 1, 0, 0, 0, 7, 6, 2, 1, 0, 5, ~
$ YearsWithCurrManager     <int> 3, 9, 2, 7, 3, 7, 3, 0, 0, 7, 14, 2, 6, 0, 7,~

3.Exploratory Data Analysis

1. Graphs by Job Satistifaction

Display Employees Per Job Satistifaction by Gender

# count and percent
Gender_Job_table = employeeData %>% 
  count(Gender, JobSatisfaction) %>% 
  group_by(Gender) %>% 
  mutate(proportion = n/sum(n))
Gender_Job_table
# A tibble: 8 x 4
# Groups:   Gender [2]
  Gender JobSatisfaction     n proportion
  <chr>            <int> <int>      <dbl>
1 Female               1    71      0.201
2 Female               2    76      0.215
3 Female               3   102      0.288
4 Female               4   105      0.297
5 Male                 1   108      0.209
6 Male                 2    90      0.174
7 Male                 3   152      0.295
8 Male                 4   166      0.322
#Create visulization
JS_Gender = Gender_Job_table %>% 
  ggplot(aes(x = JobSatisfaction, y = proportion, fill = Gender)) +
  geom_col(show.legend = TRUE, position = "dodge")+
  ggtitle("Employees  Job Satistifaction by Gender") + xlab("Job Satisfaction") + ylab("Proportion")
  
# Display Employees Per Job Satistifaction by Gender
ggplotly(JS_Gender)

Display Employees Per Job Satisfaction by Age

 #count and percent
Age_Job_table = employeeData %>% 
  count(Age, JobSatisfaction) %>% 
  group_by(Age) %>% 
  mutate(proportion = n/sum(n))
Age_Job_table
# A tibble: 161 x 4
# Groups:   Age [43]
     Age JobSatisfaction     n proportion
   <int>           <int> <int>      <dbl>
 1    18               2     1      0.167
 2    18               3     3      0.5  
 3    18               4     2      0.333
 4    19               1     1      0.143
 5    19               2     3      0.429
 6    19               3     1      0.143
 7    19               4     2      0.286
 8    20               1     1      0.25 
 9    20               3     1      0.25 
10    20               4     2      0.5  
# ... with 151 more rows
#Create visualization
JS_Age = Age_Job_table %>% 
  ggplot(aes(x = JobSatisfaction, y = proportion, fill = Age)) +
  geom_col(show.legend = TRUE, position = "dodge")+
  ggtitle("Employees  Job Satistifaction by Age") + xlab("Job Satisfaction") + ylab("Proportion")

#Display Employees Per Job Satisfaction by Age
ggplotly(JS_Age)

Display Employees Per Overtime by Attrition

#count and percent
AttrOT_table = employeeData %>% 
  count(Attrition, OverTime) %>% 
  group_by(Attrition) %>% 
  mutate(proportion = n/sum(n))
AttrOT_table
# A tibble: 4 x 4
# Groups:   Attrition [2]
  Attrition OverTime     n proportion
  <chr>     <chr>    <int>      <dbl>
1 No        No         558      0.764
2 No        Yes        172      0.236
3 Yes       No          60      0.429
4 Yes       Yes         80      0.571
#Create visualization
Attr_OT = AttrOT_table %>% 
  ggplot(aes(x = OverTime, y = proportion, fill = Attrition)) +
  geom_col(show.legend = TRUE, position = "dodge")+
  ggtitle("Employees  Overtime by Attrition") + xlab("Overtime") + ylab("Proportion")

#Display Employees Per Overtime by Attrition
ggplotly(Attr_OT)

Display Employees Attrition By Job Role

#count and percent
AttrJob_table = employeeData %>% 
  count(JobRole,Attrition) %>% 
  group_by(JobRole) %>% 
  mutate(proportion = n/sum(n))
AttrJob_table
# A tibble: 18 x 4
# Groups:   JobRole [9]
   JobRole                   Attrition     n proportion
   <chr>                     <chr>     <int>      <dbl>
 1 Healthcare Representative No           68     0.895 
 2 Healthcare Representative Yes           8     0.105 
 3 Human Resources           No           21     0.778 
 4 Human Resources           Yes           6     0.222 
 5 Laboratory Technician     No          123     0.804 
 6 Laboratory Technician     Yes          30     0.196 
 7 Manager                   No           47     0.922 
 8 Manager                   Yes           4     0.0784
 9 Manufacturing Director    No           85     0.977 
10 Manufacturing Director    Yes           2     0.0230
11 Research Director         No           50     0.980 
12 Research Director         Yes           1     0.0196
13 Research Scientist        No          140     0.814 
14 Research Scientist        Yes          32     0.186 
15 Sales Executive           No          167     0.835 
16 Sales Executive           Yes          33     0.165 
17 Sales Representative      No           29     0.547 
18 Sales Representative      Yes          24     0.453 
# Create visualization 
Attr_Job = AttrJob_table %>% 
  ggplot(aes(x = Attrition, y = proportion, fill = JobRole)) +
  geom_col(show.legend = TRUE, position = "dodge")+
  ggtitle("Employees Attrition by Job Role") + xlab("Job Role") + ylab("Proportion")

#Display Employees Attrition By Job Role
ggplotly(Attr_Job)

Display Employees Overtime By Job Role

#count and percent
OTJob_table = employeeData %>% 
  count(JobRole, OverTime) %>% 
  group_by(JobRole) %>% 
  mutate(proportion = n/sum(n))
OTJob_table
# A tibble: 18 x 4
# Groups:   JobRole [9]
   JobRole                   OverTime     n proportion
   <chr>                     <chr>    <int>      <dbl>
 1 Healthcare Representative No          54      0.711
 2 Healthcare Representative Yes         22      0.289
 3 Human Resources           No          21      0.778
 4 Human Resources           Yes          6      0.222
 5 Laboratory Technician     No         120      0.784
 6 Laboratory Technician     Yes         33      0.216
 7 Manager                   No          41      0.804
 8 Manager                   Yes         10      0.196
 9 Manufacturing Director    No          64      0.736
10 Manufacturing Director    Yes         23      0.264
11 Research Director         No          35      0.686
12 Research Director         Yes         16      0.314
13 Research Scientist        No         107      0.622
14 Research Scientist        Yes         65      0.378
15 Sales Executive           No         141      0.705
16 Sales Executive           Yes         59      0.295
17 Sales Representative      No          35      0.660
18 Sales Representative      Yes         18      0.340
#Create Visualization
OTJob = OTJob_table %>% 
  ggplot(aes(x = OverTime, y = proportion, fill = JobRole)) +
  geom_col(show.legend = TRUE, position = "dodge")+
  ggtitle("Employees  Overtime by Job Role") + xlab("Overtime") + ylab("Proportion")

#Display Employees Overtime By Job Role
ggplotly(OTJob)

Display Employees Attrition By Age

#count and percent
AttrAge_table = employeeData %>% 
  count(Age, Attrition) %>% 
  group_by(Age) %>% 
  mutate(proportion = n/sum(n))
AttrAge_table
# A tibble: 81 x 4
# Groups:   Age [43]
     Age Attrition     n proportion
   <int> <chr>     <int>      <dbl>
 1    18 No            2      0.333
 2    18 Yes           4      0.667
 3    19 No            3      0.429
 4    19 Yes           4      0.571
 5    20 No            1      0.25 
 6    20 Yes           3      0.75 
 7    21 No            4      0.667
 8    21 Yes           2      0.333
 9    22 No            5      0.625
10    22 Yes           3      0.375
# ... with 71 more rows
#Create visualization 
AttrAge = AttrAge_table %>% 
  ggplot(aes(x = Attrition, y = proportion, fill = Age)) +
  geom_col(show.legend = TRUE, position = "dodge")+
  ggtitle("Employees  Attrition By Age") + xlab("Attrition") + ylab("Proportion")

#Display Employees Attrition By Age
ggplotly(AttrAge)

Display Employees Per Attrtiton by Monthly Income

# count and percent
Attr_MI_table = employeeData %>% 
  count(MonthlyIncome, Attrition) %>% 
  group_by(MonthlyIncome) %>% 
  mutate(proportion = n/sum(n))
Attr_MI_table
# A tibble: 839 x 4
# Groups:   MonthlyIncome [826]
   MonthlyIncome Attrition     n proportion
           <int> <chr>     <int>      <dbl>
 1          1081 Yes           1          1
 2          1091 Yes           1          1
 3          1102 Yes           1          1
 4          1118 Yes           1          1
 5          1129 No            1          1
 6          1223 No            1          1
 7          1274 No            1          1
 8          1281 No            1          1
 9          1393 Yes           1          1
10          1420 Yes           1          1
# ... with 829 more rows
# visualization
Attr_MI = Attr_MI_table %>% 
  ggplot(aes(x = Attrition, y = proportion, fill = MonthlyIncome)) +
  geom_col(show.legend = TRUE, position = "dodge")+
  ggtitle("Employees  Attrition by Monthly Income") + xlab("Attrition") + ylab("Proportion")
  
#Display Employees Per Attrition by Monthly Income
ggplotly(Attr_MI)

4. Modeling

Model 1: Naive Bayes

# data prep
df = employeeData
summary(df)
       ID             Age         Attrition         BusinessTravel    
 Min.   :  1.0   Min.   :18.00   Length:870         Length:870        
 1st Qu.:218.2   1st Qu.:30.00   Class :character   Class :character  
 Median :435.5   Median :35.00   Mode  :character   Mode  :character  
 Mean   :435.5   Mean   :36.83                                        
 3rd Qu.:652.8   3rd Qu.:43.00                                        
 Max.   :870.0   Max.   :60.00                                        
   DailyRate       Department        DistanceFromHome   Education    
 Min.   : 103.0   Length:870         Min.   : 1.000   Min.   :1.000  
 1st Qu.: 472.5   Class :character   1st Qu.: 2.000   1st Qu.:2.000  
 Median : 817.5   Mode  :character   Median : 7.000   Median :3.000  
 Mean   : 815.2                      Mean   : 9.339   Mean   :2.901  
 3rd Qu.:1165.8                      3rd Qu.:14.000   3rd Qu.:4.000  
 Max.   :1499.0                      Max.   :29.000   Max.   :5.000  
 EducationField     EmployeeCount EmployeeNumber   EnvironmentSatisfaction
 Length:870         Min.   :1     Min.   :   1.0   Min.   :1.000          
 Class :character   1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
 Mode  :character   Median :1     Median :1039.0   Median :3.000          
                    Mean   :1     Mean   :1029.8   Mean   :2.701          
                    3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
                    Max.   :1     Max.   :2064.0   Max.   :4.000          
    Gender            HourlyRate     JobInvolvement     JobLevel    
 Length:870         Min.   : 30.00   Min.   :1.000   Min.   :1.000  
 Class :character   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
 Mode  :character   Median : 66.00   Median :3.000   Median :2.000  
                    Mean   : 65.61   Mean   :2.723   Mean   :2.039  
                    3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
                    Max.   :100.00   Max.   :4.000   Max.   :5.000  
   JobRole          JobSatisfaction MaritalStatus      MonthlyIncome  
 Length:870         Min.   :1.000   Length:870         Min.   : 1081  
 Class :character   1st Qu.:2.000   Class :character   1st Qu.: 2840  
 Mode  :character   Median :3.000   Mode  :character   Median : 4946  
                    Mean   :2.709                      Mean   : 6390  
                    3rd Qu.:4.000                      3rd Qu.: 8182  
                    Max.   :4.000                      Max.   :19999  
  MonthlyRate    NumCompaniesWorked    Over18            OverTime        
 Min.   : 2094   Min.   :0.000      Length:870         Length:870        
 1st Qu.: 8092   1st Qu.:1.000      Class :character   Class :character  
 Median :14074   Median :2.000      Mode  :character   Mode  :character  
 Mean   :14326   Mean   :2.728                                           
 3rd Qu.:20456   3rd Qu.:4.000                                           
 Max.   :26997   Max.   :9.000                                           
 PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours
 Min.   :11.0      Min.   :3.000     Min.   :1.000            Min.   :80   
 1st Qu.:12.0      1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80   
 Median :14.0      Median :3.000     Median :3.000            Median :80   
 Mean   :15.2      Mean   :3.152     Mean   :2.707            Mean   :80   
 3rd Qu.:18.0      3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80   
 Max.   :25.0      Max.   :4.000     Max.   :4.000            Max.   :80   
 StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
 Min.   :0.0000   Min.   : 0.00     Min.   :0.000         Min.   :1.000  
 1st Qu.:0.0000   1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000  
 Median :1.0000   Median :10.00     Median :3.000         Median :3.000  
 Mean   :0.7839   Mean   :11.05     Mean   :2.832         Mean   :2.782  
 3rd Qu.:1.0000   3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000  
 Max.   :3.0000   Max.   :40.00     Max.   :6.000         Max.   :4.000  
 YearsAtCompany   YearsInCurrentRole YearsSinceLastPromotion
 Min.   : 0.000   Min.   : 0.000     Min.   : 0.000         
 1st Qu.: 3.000   1st Qu.: 2.000     1st Qu.: 0.000         
 Median : 5.000   Median : 3.000     Median : 1.000         
 Mean   : 6.962   Mean   : 4.205     Mean   : 2.169         
 3rd Qu.:10.000   3rd Qu.: 7.000     3rd Qu.: 3.000         
 Max.   :40.000   Max.   :18.000     Max.   :15.000         
 YearsWithCurrManager
 Min.   : 0.00       
 1st Qu.: 2.00       
 Median : 3.00       
 Mean   : 4.14       
 3rd Qu.: 7.00       
 Max.   :17.00       
df = df %>% 
  dplyr::select(-ID, - EmployeeCount, - EmployeeNumber, -Over18)  %>% 
  mutate_if(is.character, factor)


## Create train and test sets
set.seed(120)
trainIndices = sample(seq(1, nrow(df), by = 1),(.7*nrow(df)))
trainData = df[trainIndices,]
testData = df[-trainIndices,]

# Naive Bayes model
model.nb = naiveBayes(Attrition ~ ., data = df, positive = "Yes")

# model summary
summary(model.nb )
          Length Class  Mode     
apriori    2     table  numeric  
tables    31     -none- list     
levels     2     -none- character
isnumeric 31     -none- logical  
call       5     -none- call     
model.nb$apriori
Y
 No Yes 
730 140 
# Prediction
predData  = predict(model.nb, testData)
table(predData)
predData
 No Yes 
214  47 
predData  = factor(predData)

# Accuracy
confusionMatrix(predData,testData$Attrition, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  202  12
       Yes  22  25
                                          
               Accuracy : 0.8697          
                 95% CI : (0.8227, 0.9081)
    No Information Rate : 0.8582          
    P-Value [Acc > NIR] : 0.3349          
                                          
                  Kappa : 0.5189          
                                          
 Mcnemar's Test P-Value : 0.1227          
                                          
            Sensitivity : 0.67568         
            Specificity : 0.90179         
         Pos Pred Value : 0.53191         
         Neg Pred Value : 0.94393         
             Prevalence : 0.14176         
         Detection Rate : 0.09579         
   Detection Prevalence : 0.18008         
      Balanced Accuracy : 0.78873         
                                          
       'Positive' Class : Yes             
                                          

The model requirements were atleast a 60% sensitivity and specificity for the training and validation set. The model met these requirements with a 86.97% accuracy, 67.57% sensitivity and 90.18%.

# data prep
df = employeeData
summary(df)
       ID             Age         Attrition         BusinessTravel    
 Min.   :  1.0   Min.   :18.00   Length:870         Length:870        
 1st Qu.:218.2   1st Qu.:30.00   Class :character   Class :character  
 Median :435.5   Median :35.00   Mode  :character   Mode  :character  
 Mean   :435.5   Mean   :36.83                                        
 3rd Qu.:652.8   3rd Qu.:43.00                                        
 Max.   :870.0   Max.   :60.00                                        
   DailyRate       Department        DistanceFromHome   Education    
 Min.   : 103.0   Length:870         Min.   : 1.000   Min.   :1.000  
 1st Qu.: 472.5   Class :character   1st Qu.: 2.000   1st Qu.:2.000  
 Median : 817.5   Mode  :character   Median : 7.000   Median :3.000  
 Mean   : 815.2                      Mean   : 9.339   Mean   :2.901  
 3rd Qu.:1165.8                      3rd Qu.:14.000   3rd Qu.:4.000  
 Max.   :1499.0                      Max.   :29.000   Max.   :5.000  
 EducationField     EmployeeCount EmployeeNumber   EnvironmentSatisfaction
 Length:870         Min.   :1     Min.   :   1.0   Min.   :1.000          
 Class :character   1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
 Mode  :character   Median :1     Median :1039.0   Median :3.000          
                    Mean   :1     Mean   :1029.8   Mean   :2.701          
                    3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
                    Max.   :1     Max.   :2064.0   Max.   :4.000          
    Gender            HourlyRate     JobInvolvement     JobLevel    
 Length:870         Min.   : 30.00   Min.   :1.000   Min.   :1.000  
 Class :character   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
 Mode  :character   Median : 66.00   Median :3.000   Median :2.000  
                    Mean   : 65.61   Mean   :2.723   Mean   :2.039  
                    3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
                    Max.   :100.00   Max.   :4.000   Max.   :5.000  
   JobRole          JobSatisfaction MaritalStatus      MonthlyIncome  
 Length:870         Min.   :1.000   Length:870         Min.   : 1081  
 Class :character   1st Qu.:2.000   Class :character   1st Qu.: 2840  
 Mode  :character   Median :3.000   Mode  :character   Median : 4946  
                    Mean   :2.709                      Mean   : 6390  
                    3rd Qu.:4.000                      3rd Qu.: 8182  
                    Max.   :4.000                      Max.   :19999  
  MonthlyRate    NumCompaniesWorked    Over18            OverTime        
 Min.   : 2094   Min.   :0.000      Length:870         Length:870        
 1st Qu.: 8092   1st Qu.:1.000      Class :character   Class :character  
 Median :14074   Median :2.000      Mode  :character   Mode  :character  
 Mean   :14326   Mean   :2.728                                           
 3rd Qu.:20456   3rd Qu.:4.000                                           
 Max.   :26997   Max.   :9.000                                           
 PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours
 Min.   :11.0      Min.   :3.000     Min.   :1.000            Min.   :80   
 1st Qu.:12.0      1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80   
 Median :14.0      Median :3.000     Median :3.000            Median :80   
 Mean   :15.2      Mean   :3.152     Mean   :2.707            Mean   :80   
 3rd Qu.:18.0      3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80   
 Max.   :25.0      Max.   :4.000     Max.   :4.000            Max.   :80   
 StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
 Min.   :0.0000   Min.   : 0.00     Min.   :0.000         Min.   :1.000  
 1st Qu.:0.0000   1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000  
 Median :1.0000   Median :10.00     Median :3.000         Median :3.000  
 Mean   :0.7839   Mean   :11.05     Mean   :2.832         Mean   :2.782  
 3rd Qu.:1.0000   3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000  
 Max.   :3.0000   Max.   :40.00     Max.   :6.000         Max.   :4.000  
 YearsAtCompany   YearsInCurrentRole YearsSinceLastPromotion
 Min.   : 0.000   Min.   : 0.000     Min.   : 0.000         
 1st Qu.: 3.000   1st Qu.: 2.000     1st Qu.: 0.000         
 Median : 5.000   Median : 3.000     Median : 1.000         
 Mean   : 6.962   Mean   : 4.205     Mean   : 2.169         
 3rd Qu.:10.000   3rd Qu.: 7.000     3rd Qu.: 3.000         
 Max.   :40.000   Max.   :18.000     Max.   :15.000         
 YearsWithCurrManager
 Min.   : 0.00       
 1st Qu.: 2.00       
 Median : 3.00       
 Mean   : 4.14       
 3rd Qu.: 7.00       
 Max.   :17.00       
df = df %>% 
  dplyr::select(-ID, - EmployeeCount, - EmployeeNumber, -Over18)  %>% 
  mutate_if(is.character, factor)

# Random Forest method


model.rf = randomForest(Attrition ~ ., ntree = 100, keep.forest=FALSE,
                  data = df,
                   importance = TRUE)
model.rf

Call:
 randomForest(formula = Attrition ~ ., data = df, ntree = 100,      keep.forest = FALSE, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 100
No. of variables tried at each split: 5

        OOB estimate of  error rate: 14.25%
Confusion matrix:
     No Yes class.error
No  720  10  0.01369863
Yes 114  26  0.81428571
# Importance Variable Plot

Imp_Var = varImp(model.rf)
(varImpPlot(model.rf))

                         MeanDecreaseAccuracy MeanDecreaseGini
Age                                 3.5973985        13.376111
BusinessTravel                     -0.5630967         3.023952
DailyRate                           1.5883593        11.544679
Department                          2.4159246         2.293979
DistanceFromHome                    1.6180845        11.466528
Education                           1.7145657         4.719197
EducationField                      1.5976790         8.391209
EnvironmentSatisfaction             1.6543868         6.055542
Gender                              0.3923474         1.562547
HourlyRate                         -0.2347427        10.024692
JobInvolvement                      3.7416241         8.373097
JobLevel                            4.0980405         4.368232
JobRole                             4.5790442        13.026309
JobSatisfaction                    -0.5504987         5.973957
MaritalStatus                       4.9904910         5.336642
MonthlyIncome                       6.5021419        18.317048
MonthlyRate                        -2.0438151        11.147896
NumCompaniesWorked                  2.4167839         7.606961
OverTime                            8.8362658        13.257364
PercentSalaryHike                   2.0237277         8.890304
PerformanceRating                   0.2388280         1.180633
RelationshipSatisfaction           -0.7918954         5.370032
StandardHours                       0.0000000         0.000000
StockOptionLevel                    5.4565358         8.810542
TotalWorkingYears                   4.5280657        11.792019
TrainingTimesLastYear               0.8565250         5.792272
WorkLifeBalance                     1.5585349         6.054656
YearsAtCompany                      3.5529660         9.731998
YearsInCurrentRole                  2.3545268         5.611045
YearsSinceLastPromotion             2.8582968         6.120367
YearsWithCurrManager                4.6991491         6.495443
## Load the validation data set that does not include Attrition


valData = read.csv("CaseStudy2CompSet No Attrition.csv")
#View(valData)

## Export the predicted results from my model into a .csv file for submission
validationPrediction <- predict(model.nb, valData) 
table(validationPrediction)
validationPrediction
 No Yes 
249  51 
# create output dataset
output = valData %>% select(ID)
output = output %>% 
  mutate(Attrition = validationPrediction)


# save dataset as a csv
write.csv(output, file = "Case2PredictionsDHerring Attrition.csv", row.names = FALSE)

Based on the model to the right, it may be predicted that the top three contributing factors to attrition at Frito Lay are Monthly Income, Overtime and Age.

Model 2

## Run a linear regression with the cleaned data set with Monthly Income for Validation Requirement
fitMonthlyIncome = lm(MonthlyIncome ~ ., data = df)
summary(fitMonthlyIncome)

Call:
lm(formula = MonthlyIncome ~ ., data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3680.7  -660.4     7.4   625.3  4114.4 

Coefficients: (1 not defined because of singularities)
                                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)                       6.322e+01  7.725e+02   0.082  0.93479    
Age                              -1.430e+00  5.659e+00  -0.253  0.80049    
AttritionYes                      8.245e+01  1.156e+02   0.714  0.47573    
BusinessTravelTravel_Frequently   1.956e+02  1.422e+02   1.375  0.16950    
BusinessTravelTravel_Rarely       3.777e+02  1.202e+02   3.143  0.00173 ** 
DailyRate                         1.449e-01  9.138e-02   1.586  0.11312    
DepartmentResearch & Development  1.205e+02  4.774e+02   0.252  0.80083    
DepartmentSales                  -4.485e+02  4.885e+02  -0.918  0.35883    
DistanceFromHome                 -6.712e+00  4.577e+00  -1.466  0.14290    
Education                        -3.377e+01  3.718e+01  -0.908  0.36398    
EducationFieldLife Sciences       1.294e+02  3.695e+02   0.350  0.72633    
EducationFieldMarketing           1.039e+02  3.915e+02   0.266  0.79067    
EducationFieldMedical             1.976e+01  3.704e+02   0.053  0.95746    
EducationFieldOther               7.569e+01  3.952e+02   0.192  0.84816    
EducationFieldTechnical Degree    8.523e+01  3.848e+02   0.221  0.82476    
EnvironmentSatisfaction          -4.545e+00  3.369e+01  -0.135  0.89271    
GenderMale                        1.112e+02  7.454e+01   1.492  0.13606    
HourlyRate                       -3.812e-01  1.827e+00  -0.209  0.83478    
JobInvolvement                    1.807e+01  5.328e+01   0.339  0.73450    
JobLevel                          2.786e+03  8.353e+01  33.356  < 2e-16 ***
JobRoleHuman Resources           -2.054e+02  5.156e+02  -0.398  0.69052    
JobRoleLaboratory Technician     -6.021e+02  1.715e+02  -3.512  0.00047 ***
JobRoleManager                    4.280e+03  2.835e+02  15.099  < 2e-16 ***
JobRoleManufacturing Director     1.742e+02  1.697e+02   1.027  0.30480    
JobRoleResearch Director          4.056e+03  2.193e+02  18.489  < 2e-16 ***
JobRoleResearch Scientist        -3.482e+02  1.704e+02  -2.043  0.04135 *  
JobRoleSales Executive            5.179e+02  3.579e+02   1.447  0.14830    
JobRoleSales Representative       8.120e+01  3.923e+02   0.207  0.83605    
JobSatisfaction                   2.736e+01  3.339e+01   0.819  0.41288    
MaritalStatusMarried              6.666e+01  1.001e+02   0.666  0.50555    
MaritalStatusSingle               1.520e+01  1.355e+02   0.112  0.91072    
MonthlyRate                      -9.243e-03  5.148e-03  -1.796  0.07294 .  
NumCompaniesWorked                4.915e+00  1.693e+01   0.290  0.77164    
OverTimeYes                      -1.536e+01  8.446e+01  -0.182  0.85577    
PercentSalaryHike                 2.520e+01  1.583e+01   1.592  0.11187    
PerformanceRating                -3.247e+02  1.617e+02  -2.008  0.04494 *  
RelationshipSatisfaction          1.621e+01  3.331e+01   0.487  0.62665    
StandardHours                            NA         NA      NA       NA    
StockOptionLevel                  4.062e+00  5.695e+01   0.071  0.94316    
TotalWorkingYears                 5.124e+01  1.099e+01   4.661 3.66e-06 ***
TrainingTimesLastYear             2.375e+01  2.917e+01   0.814  0.41574    
WorkLifeBalance                  -3.616e+01  5.169e+01  -0.700  0.48441    
YearsAtCompany                   -4.709e+00  1.363e+01  -0.345  0.72990    
YearsInCurrentRole                5.629e+00  1.703e+01   0.330  0.74111    
YearsSinceLastPromotion           3.048e+01  1.534e+01   1.987  0.04723 *  
YearsWithCurrManager             -2.576e+01  1.670e+01  -1.542  0.12341    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1057 on 825 degrees of freedom
Multiple R-squared:  0.9498,    Adjusted R-squared:  0.9471 
F-statistic: 354.9 on 44 and 825 DF,  p-value: < 2.2e-16
## Load the validation data set that does not include MonthlyIncome


valData.MI = read_excel("CaseStudy2CompSet No Salary.xlsx")
## Export the predicted results from my model into a .csv file for submission
validationPrediction.MI = predict(fitMonthlyIncome, valData.MI) 


output2 = valData.MI %>% select(ID)
output2 = output2 %>% 
  mutate(MonthlyIncome = validationPrediction.MI)

write.csv(output2, file = "Case2PredictionsDHerring Salary.csv", row.names = FALSE)

The model met the required RMSE of less than $3,000 with a RMSE of $1,057.